Install, load libraries, and create directories
# Install kableExtra only when it is not already available, so that re-running
# the script does not reinstall the package (and hit the network) every time.
if (!requireNamespace("kableExtra", quietly = TRUE)) {
  install.packages("kableExtra")
}

# Packages used below:
#   tidyverse  - read_csv(), dplyr verbs, ggplot2
#   plotly     - ggplotly() for interactive plots
#   ggpubr     - ggboxplot(), stat_compare_means()
#   kableExtra - kbl(), kable_classic() for formatted tables
library(tidyverse)
library(plotly)
library(ggpubr)
library(kableExtra)
Import the gapminder dataset and graph CO2 emissions for 1962
# Read the cleaned gapminder data from the current working directory
gapminder_clean <- read_csv(paste0(getwd(), "/gapminder_clean.csv"))

# Scatter plot with a smoothed trend line of CO2 emissions (x) against
# GDP per capita (y), restricted to 1962. The ggplot object is built first
# and then piped into ggplotly() to make it interactive.
(
  gapminder_clean %>%
    filter(Year == 1962) %>%
    ggplot(aes(`CO2 emissions (metric tons per capita)`, gdpPercap)) +
    geom_point() +
    geom_smooth() +
    ggtitle("CO2 emission vs GDP") +
    theme(plot.title = element_text(hjust = 0.5))
) %>%
  plotly::ggplotly()
# dev.off()
Correlate CO2 levels with the GDP
# Correlate CO2 emissions with GDP per capita for 1962 only.
#
# The columns are referenced by bare name inside summarise() so that the test
# runs on the rows kept by filter(). Referencing `gapminder_clean$...` instead
# would pull the full, unfiltered columns and report the all-years correlation
# under a "1962" caption.
gapminder_clean %>%
  filter(Year == 1962) %>%
  # Run the test once and keep the whole result in a list-column
  summarise(
    test = list(
      stats::cor.test(`CO2 emissions (metric tons per capita)`, gdpPercap)
    )
  ) %>%
  # Extract the estimate and p-value and format them for display
  transmute(
    correlation = formatC(
      map_dbl(test, function(res) unname(res$estimate)),
      format = NULL, digits = 3
    ),
    pval = formatC(
      map_dbl(test, function(res) res$p.value),
      format = "e", digits = 2 # Adjust digits as needed
    )
  ) %>%
  kbl(
    # Caption tags are properly closed (</strong></center>)
    caption = "<center><strong>Correlation (CO2 vs GDP), 1962</strong></center>",
    align = 'c',
    caption.align = 'c'
  ) %>%
  kable_classic(html_font = "Cambria")
Correlation (CO2 vs GDP), 1962
|
correlation
|
pval
|
|
0.813
|
2.93e-280
|
# correlation pval
# <dbl> <dbl>
# 1 0.813 2.93e-280
# Correlate CO2 emissions with GDP per capita separately for every year and
# show the five years with the strongest correlation.
gapminder_clean %>%
  filter(
    !is.na(`CO2 emissions (metric tons per capita)`),
    !is.na(gdpPercap)
  ) %>%
  group_by(Year) %>%
  # Run the test once per year and keep the whole result in a list-column
  summarise(
    test = list(cor.test(`CO2 emissions (metric tons per capita)`, gdpPercap))
  ) %>%
  mutate(
    correlation = map_dbl(test, function(res) unname(res$estimate)),
    pval = map_dbl(test, function(res) res$p.value)
  ) %>%
  select(-test) %>%
  # Rank while the values are still numeric: formatC() returns character
  # strings, and sorting those would order the rows lexicographically.
  arrange(desc(correlation)) %>%
  slice_head(n = 5) %>%
  # Format for display only after ranking
  mutate(
    correlation = formatC(correlation, format = NULL, digits = 3),
    pval = formatC(pval, format = "e", digits = 2) # Adjust digits as needed
  ) %>%
  kbl(
    # Caption tags are properly closed (</strong></center>)
    caption = "<center><strong>Correlation(CO2 vs GDP)</strong></center>",
    align = 'c',
    caption.align = 'center'
  ) %>%
  kable_classic(html_font = "Cambria")
Correlation(CO2 vs GDP)
|
Year
|
correlation
|
pval
|
|
1967
|
0.939
|
3.40e-53
|
|
1962
|
0.926
|
1.13e-46
|
|
1972
|
0.843
|
1.82e-32
|
|
1982
|
0.817
|
5.57e-29
|
|
1987
|
0.81
|
3.90e-28
|
# Year correlation pval
# <dbl> <dbl> <dbl>
# 1 1967 0.939 3.40e-53
What is the relationship between continent and ‘Energy use (kg of
oil equivalent per capita)’? (stats test needed)
# Box plot of per-capita energy use by continent, annotated with:
#   * a global ANOVA p-value across continents, and
#   * a t-test of each continent against the ".all." reference group,
#     shown as significance symbols.
# Rows missing either energy use or continent are dropped first, and the long
# column name is shortened because ggboxplot() takes column names as strings.
plotly::ggplotly(
  gapminder_clean %>%
    filter(
      !is.na(`Energy use (kg of oil equivalent per capita)`),
      !is.na(continent)
    ) %>%
    rename(Energy_use = `Energy use (kg of oil equivalent per capita)`) %>%
    ggboxplot(x = "continent", y = "Energy_use", color = "continent") +
    # `height` / `width` are not ggplot2 theme elements, so the former
    # theme(height = 3, width = 2) call was dropped; the stray empty argument
    # in the ANOVA call was removed as well.
    stat_compare_means(method = "anova", label.y = 17000, label.x = 3) +
    stat_compare_means(
      label = "p.signif",
      method = "t.test",
      ref.group = ".all.",
      label.y = 16000
    )
)
## There is a statistically significant relationship between continent and energy use
Is there a significant difference between Europe and Asia with
respect to ‘Imports of goods and services (% of GDP)’ in the years after
1990? (stats test needed)
# Compare imports (% of GDP) between Asia and Europe for years after 1990.
# Box plots with jittered points per continent, annotated with a t-test
# p-value and a significance symbol relative to Asia.
(
  gapminder_clean %>%
    filter(
      Year > 1990,
      continent %in% c("Asia", "Europe"),
      !is.na(`Imports of goods and services (% of GDP)`) & !is.na(continent)
    ) %>%
    group_by(continent) %>%
    # Short name so the column can be passed to ggboxplot() as a string
    rename(
      Imports_goods_services_of_GDP = `Imports of goods and services (% of GDP)`
    ) %>%
    ggboxplot(
      x = "continent",
      y = "Imports_goods_services_of_GDP",
      color = "continent",
      add = "jitter",
      palette = "jco",
      legend = "none"
    ) +
    stat_compare_means(method = "t.test", label.y = 210) +
    stat_compare_means(
      label = "p.signif",
      method = "t.test",
      ref.group = "Asia",
      label.y = 190
    )
) %>%
  plotly::ggplotly()
##No significant difference in Imports between Asia and Europe
### What is the country (or countries) that has the highest
‘Population density (people per sq. km of land area)’ across all years?
(i.e., which country has the highest average ranking in this category
across each time point in the dataset?)
# For every year, keep the country (or tied countries) with the highest
# population density, then draw a bar chart counting how many years each
# country held that top position.
(
  gapminder_clean %>%
    select(
      Year,
      `Country Name`,
      `Population density (people per sq. km of land area)`
    ) %>%
    filter(
      !is.na(`Population density (people per sq. km of land area)`),
      !is.na(`Country Name`)
    ) %>%
    group_by(Year) %>%
    # slice_max() picks the top row(s) within each year on its own,
    # so no prior sorting is needed
    slice_max(
      order_by = `Population density (people per sq. km of land area)`,
      n = 1
    ) %>%
    ungroup() %>%
    ggplot(aes(x = `Country Name`, fill = `Country Name`)) +
    geom_bar() +
    ggtitle("Population Density counts(1962-2007)") +
    xlab("Country") +
    ylab("Highest ranked(count)") +
    theme_classic(base_size = 12) +
    # Start the y axis exactly at zero with no padding
    scale_y_continuous(expand = c(0, 0), limits = c(0, NA)) +
    theme(legend.position = "none", plot.title = element_text(hjust = 0.5))
) %>%
  plotly::ggplotly()
#`Country Name` n
# <chr> <int>
# 1 Macao SAR, China 5
# 2 Monaco 5
What country (or countries) has shown the greatest increase in ‘Life
expectancy at birth, total (years)’ between 1962 and 2007?
# Bar chart of the ten countries with the largest gain in life expectancy
# between 1962 and 2007 (value in 2007 minus value in 1962).
plotly::ggplotly(
  gapminder_clean %>%
    select(Year, `Country Name`, `Life expectancy at birth, total (years)`) %>%
    filter(
      Year %in% c(1962, 2007),
      !is.na(`Country Name`),
      !is.na(`Life expectancy at birth, total (years)`)
    ) %>%
    group_by(`Country Name`) %>%
    # Keep only countries that have a value for both endpoint years. Without
    # this, the subtraction below yields a zero-length result for such groups,
    # which summarise() drops silently (deprecated since dplyr 1.1.0).
    filter(all(c(1962, 2007) %in% Year)) %>%
    summarise(
      Life_exp_change =
        `Life expectancy at birth, total (years)`[Year == 2007] -
        `Life expectancy at birth, total (years)`[Year == 1962]
    ) %>%
    arrange(desc(Life_exp_change)) %>%
    slice_head(n = 10) %>%
    # Order the bars from largest to smallest increase
    ggplot(aes(
      x = reorder(`Country Name`, -Life_exp_change),
      y = Life_exp_change,
      fill = `Country Name`
    )) +
    geom_col() +
    ggtitle("Life Expectancy(1962-2007)") +
    ylab("Increase in life expectancy(years)") +
    xlab("Country") +
    theme_classic(base_size = 14) +
    # Start the y axis exactly at zero with no padding
    scale_y_continuous(expand = c(0, 0), limits = c(0, NA)) +
    theme(
      legend.position = "none",
      plot.title = element_text(hjust = 0.5),
      axis.text.x = element_text(angle = 45, vjust = 0.5)
    )
)
#Maldives